In [None]:
# Git branch of the NeMo repository that the setup cell below clones and pip-installs from.
BRANCH = 'main'

"""
You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.

Instructions for setting up Colab are as follows:
1. Open a new Python 3 notebook.
2. Import this notebook from GitHub (File -> Upload Notebook -> "GITHUB" tab -> copy/paste GitHub URL)
3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select "GPU" for hardware accelerator)
4. Run this cell to set up dependencies.
"""

In [None]:
import os
# either provide a path to local NeMo repository with NeMo already installed or git clone

# option #1: local path to NeMo repo with NeMo already installed
# (two directory levels above the current working directory)
NEMO_DIR_PATH = os.path.dirname(os.path.dirname(os.path.abspath('')))
is_colab = False

# option #2: download NeMo repo
# Taken on Colab, and also locally whenever no "nemo" directory exists under NEMO_DIR_PATH.
if 'google.colab' in str(get_ipython()) or not os.path.exists(os.path.join(NEMO_DIR_PATH, "nemo")):
    ## Install dependencies
    !apt-get install sox libsndfile1 ffmpeg

    !git clone -b $BRANCH https://github.com/NVIDIA/NeMo
    %cd NeMo
    !python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]
    # `%cd NeMo` above changed the working directory, so this now points at the fresh clone
    NEMO_DIR_PATH = os.path.abspath('')
    # NOTE: despite its name, this flag is also set for a local run that had to clone NeMo;
    # later cells use it to switch to HTML rendering of the colored transcripts.
    is_colab = True

import sys
# put the NeMo checkout first on the module search path
sys.path.insert(0, NEMO_DIR_PATH)

# 1. Introduction to ASR confidence estimation
Confidence estimation is a crucial yet sometimes overlooked aspect of automatic speech recognition (ASR) systems. Confidence estimation for ASR is the process of estimating how reliable the output generated by an ASR system is. For an output transcription, confidence estimation answers the question "how accurate is this transcription?", or "how likely is it that this transcription is correct?".

Confidence score is the result of confidence estimation. It lies in range from 0 to 1, where zero signals that the confidence estimator is completely unsure, and one indicates that the estimator is confident in the output. Confidence scores are often used to guide downstream processing in ASR applications. For example, in a voice dictation application, a low confidence score could trigger the system to ask the user to repeat the input or to suggest alternative transcriptions.

There are several approaches to confidence estimation in ASR, including:

1. Acoustic modeling-based methods: These methods use the acoustic model scores to estimate the confidence score. The acoustic model represents the relationship between the acoustic signal and the corresponding linguistic units, and the score reflects the similarity between the observed signal and the predicted model output. Here, the acoustic model can be the ASR model itself (non-trainable methods), or a trainable external estimator, accepting acoustic features or output probabilities and predicting confidence scores.

2. Language modeling-based methods: These methods use the language model scores to estimate the confidence score. The language model represents the probability distribution of the sequence of words, and the score reflects the likelihood of the transcription given the language model. 

3. Combination methods: These methods combine the scores from both the acoustic and language models to estimate the confidence score. This approach can leverage the strengths of both models to achieve more accurate confidence scores.

In this introductory tutorial we will cover only the non-trainable acoustic-based methods.

## 1.1. Optional resources
This tutorial is self-contained, but if you want to dive deeper into the topic, you can check out these resources:
* Paper behind this tutorial: https://arxiv.org/abs/2212.08703
* Supplementary blog on how and why confidence estimation methods of this tutorial were developed: https://developer.nvidia.com/blog/entropy-based-methods-for-word-level-asr-confidence-estimation/

# 2. Data Download
First, let's download audio and text data. Here we will use LibriSpeech *test-other*.

In [None]:
## create data directory and download an audio file
# Both paths are relative to the current working directory.
WORK_DIR = 'WORK_DIR'
DATA_DIR = WORK_DIR + '/DATA'
os.makedirs(DATA_DIR, exist_ok=True)

print('downloading audio data...')
# Run NeMo's LibriSpeech helper script for the test_other subset; later cells read
# DATA_DIR + "/test_other.json" as the manifest.
!python $NEMO_DIR_PATH/scripts/dataset_processing/get_librispeech_data.py --data_root=$DATA_DIR --data_set=test_other
# Remove the downloaded archive to free disk space.
!rm $DATA_DIR/test_other.tar.gz

# 3. Confidence estimation example
Let's see how confidence scores can be obtained with NeMo models.

## 3.1. Helper functions
The following functions are to pretty-print confidence scores for word-level ASR hypotheses.

In [None]:
import json
import os
from termcolor import colored
from typing import List, Optional, Tuple, Union

from IPython.display import Audio, HTML, Image, display
import numpy as np
from kaldialign import align 

def get_detailed_wer_labels(ref: List[str], hyp: List[str], return_eps_padded_hyp: bool = False):
    """Get detailed WER labels, aligning reference with hypothesis.

    Possible WER labels:
        - 'C' for Correct,
        - 'I' for Insertion,
        - 'D' for Deletion,
        - 'S' for Substitution.

    Args:
        ref: Reference words.
        hyp: Hypothesis words.
        return_eps_padded_hyp: If True, additionally return the aligned hypothesis,
            in which positions without a hypothesis word hold "<eps>".

    Returns:
        WER labels list.
        [Optional] Epsilon-padded hypothesis if return_eps_padded_hyp set to True
        (the result is then a `(labels, eps_padded_hyp)` tuple).
    """
    eps = "<eps>"

    # Align reference and hypothesis; each alignment item is a (ref_word, hyp_word) pair
    alignment = align(ref, hyp, eps)
    aligned_hyp = [item[1] for item in alignment]

    # Determine labels
    labels = []
    for item in alignment:
        ref_word, hyp_word = item[0], item[1]
        if ref_word == hyp_word:
            labels.append("C")
        elif ref_word == eps:
            labels.append("I")
        elif hyp_word == eps:
            labels.append("D")
        else:
            labels.append("S")

    # The tuple must be parenthesized: `a if c else b, d` parses as `(a if c else b), d`,
    # which would return a tuple regardless of the flag.
    return (labels, aligned_hyp) if return_eps_padded_hyp else labels


def fill_confidence_deletions(confidence_scores: List[float], labels: List[str], fill_value: float = 0.0):
    """Expand a score list so that it lines up with WER labels containing deletions.

    Deleted words have no hypothesis word and therefore no score of their own;
    every "D" position receives `fill_value` instead.

    Returns:
        The input list itself when it already has one score per label,
        otherwise a new list with `fill_value` inserted at each "D" label.
    """

    assert len(confidence_scores) <= len(labels)

    # One score per label already: nothing to insert
    if len(labels) == len(confidence_scores):
        return confidence_scores

    filled = []
    cursor = 0  # position of the next unused real score
    for tag in labels:
        if tag != "D":
            filled.append(confidence_scores[cursor])
            cursor += 1
            continue
        filled.append(fill_value)
    return filled


def pretty_pad_word_labels(labels: List[str], words: List[str]):
    """Surround each label with dashes so that it is as wide as its word.

    The label is placed at (or just left of) the middle of the word's width.
    `labels` and `words` must have the same length.

    Returns:
        Padded labels list.
    """

    # One label per word is required
    assert len(words) == len(labels)

    def _dash_center(label: str, width: int) -> str:
        dashes_before = (width - 1) // 2
        dashes_after = width - dashes_before - 1
        return "-" * dashes_before + label + "-" * dashes_after

    return [_dash_center(label, len(word)) for word, label in zip(words, labels)]


def _html_paint_word_grey(word: str, shade: str):
    if shade == "black":
        color = "0,0,0"
    elif shade == "grey":
        color = "150,150,150"
    elif shade == "light_grey":
        color = "200,200,200"
    else:
        raise ValueError(
            f"`shade` has to be one of the following: `black`, `grey`, `light_grey`. Provided: `{shade}`"
        )
    return f'<mark style="color:rgb({color});background-color:rgb(255,255,255);">{word}</font></mark>'


def pretty_print_transcript_with_confidence(
    transcript: str,
    confidence_scores: List[float],
    threshold: float,
    reference: Optional[str] = None,
    terminal_width: int = 120,
    html: bool = False,
):
    """Print a transcript with low-confidence words shaded in light grey.

    The transcript is split on whitespace and wrapped into lines of at most
    `terminal_width` characters. When `reference` is given, the hypothesis is aligned
    with it and a line of dash-padded WER labels (C/I/D/S) is printed under every
    line of words; deleted words appear as `<eps>`.

    Args:
        transcript: Recognized text.
        confidence_scores: Word-level confidence scores, one per transcript word.
        threshold: Words with a score strictly below this value are shaded.
        reference: Optional reference text; enables the WER label lines.
        terminal_width: Maximum line width in characters.
        html: If True, render through `display(HTML(...))`; otherwise print plain
            text colored with `termcolor`.
    """
    # Pick the shading function, the line separator and the output routine for the chosen mode
    if html:
        shade_if_low_confidence = lambda x, y: _html_paint_word_grey(x, 'light_grey' if y < threshold else 'black')
        new_line_mark = "<br>"
        pretty_print = lambda x: display(HTML("<code>" + new_line_mark.join(x) + "</code>"))
    else:
        shade_if_low_confidence = lambda x, y: colored(x, 'light_grey') if y < threshold else x
        new_line_mark = "\n"
        pretty_print = lambda x: print(new_line_mark.join(x))
    with_labels = reference is not None
    transcript_list = transcript.split()
    output_lines = []
    if with_labels:
        reference_list = reference.split()
        labels, eps_padded_hyp = get_detailed_wer_labels(reference_list, transcript_list, True)
        padded_labels = pretty_pad_word_labels(labels, eps_padded_hyp)
        current_line_len = 0
        current_word_line = ""
        current_label_line = ""
        # Deletions have no score of their own; fill_confidence_deletions gives them the default 0.0
        for word, label, padded_label, score in zip(
            eps_padded_hyp, labels, padded_labels, fill_confidence_deletions(confidence_scores, labels)
        ):
            # Measure the visible width before any HTML escaping or coloring is applied
            word_len = len(word)
            # shield angle brackets for <eps>
            if html and word == "<eps>":
                word = "&lt;eps&gt;"
            if current_line_len + word_len + 1 <= terminal_width:
                if current_line_len > 0:
                    current_line_len += 1
                    current_word_line += " "
                    # keep the label line the same width as the word line
                    current_label_line += "-"
                current_line_len += word_len
                current_word_line += shade_if_low_confidence(word, score)
                current_label_line += padded_label
            else:
                # Line is full: flush the word/label pair and start a new line with this word
                output_lines.append(current_word_line + new_line_mark + current_label_line)
                current_line_len = word_len
                current_word_line = shade_if_low_confidence(word, score)
                current_label_line = padded_label
        # Flush the last, partially filled line
        if current_word_line:
            output_lines.append(current_word_line + new_line_mark + current_label_line)
    else:
        # Same wrapping logic as above, without the label line
        current_line_len = 0
        current_word_line = ""
        for word, score in zip(transcript_list, confidence_scores):
            word_len = len(word)
            # shield angle brackets for <eps>
            if html and word == "<eps>":
                word = "&lt;eps&gt;"
            if current_line_len + word_len + 1 <= terminal_width:
                if current_line_len > 0:
                    current_line_len += 1
                    current_word_line += " "
                current_line_len += word_len
                current_word_line += shade_if_low_confidence(word, score)
            else:
                output_lines.append(current_word_line)
                current_line_len = word_len
                current_word_line = shade_if_low_confidence(word, score)
        if current_word_line:
            output_lines.append(current_word_line)

    pretty_print(output_lines)

## 3.2. Data and model loading
This tutorial uses CTC and RNN-T Conformer models trained on LibriSpeech.

You can try to use other pre-trained models as well.

In [None]:
from dataclasses import dataclass
from omegaconf import DictConfig, OmegaConf

from nemo.collections.asr.models import ASRModel

def load_model(name: str):
    """Fetch a pre-trained ASR model and switch it to inference mode.

    Args:
        name: Pre-trained model name.
            Reserved names:
            - 'ctc' for 'stt_en_conformer_ctc_large_ls'
            - 'rnnt' for 'stt_en_conformer_transducer_large_ls'
            Any other value is passed through unchanged.

    Returns:
        A model loaded into GPU ("cuda:0") with .eval() mode set.
    """
    reserved_names = {
        "ctc": "stt_en_conformer_ctc_large_ls",
        "rnnt": "stt_en_conformer_transducer_large_ls",
    }
    pretrained_name = reserved_names.get(name, name)

    asr_model = ASRModel.from_pretrained(model_name=pretrained_name, map_location="cuda:0")
    asr_model.eval()
    return asr_model

@dataclass
class TestSet:
    """Contents of one evaluation manifest, held as three parallel lists (one entry per manifest line)."""

    # value of "audio_filepath" for every manifest entry
    filepaths: List[str]
    # value of "text" (the reference transcript) for every manifest entry
    reference_texts: List[str]
    # value of "duration" for every manifest entry — presumably seconds; confirm against the manifest producer
    durations: List[float]

def load_data(manifest_path: str):
    """Read a JSON-lines manifest into a `TestSet`.

    Every non-empty line must be a JSON object with the keys
    "audio_filepath", "text" and "duration".

    Args:
        manifest_path: Path to the manifest file (UTF-8 encoded, one JSON object per line).

    Returns:
        TestSet with the file paths, reference texts and durations in manifest order.
    """
    filepaths = []
    reference_texts = []
    durations = []
    # Read as UTF-8 explicitly so that non-ASCII transcripts do not depend on the platform locale
    with open(manifest_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            # Tolerate blank lines (e.g. a trailing newline at the end of the file)
            if not line:
                continue
            item = json.loads(line)
            filepaths.append(str(item["audio_filepath"]))
            reference_texts.append(item["text"])
            durations.append(float(item["duration"]))
    return TestSet(filepaths, reference_texts, durations)

# Manifest files to evaluate on, keyed by test set name.
TEST_MANIFESTS = {
    "test_other": DATA_DIR + "/test_other.json",
}


# Load data
# One TestSet per entry of TEST_MANIFESTS, under the same key.
test_sets = {manifest: load_data(path) for manifest, path in TEST_MANIFESTS.items()}

# Load model
# Switch between the CTC (False) and the RNN-T (True) model; later cells read this flag
# to pick the matching decoding config.
is_rnnt = False
# is_rnnt = True

model = load_model("rnnt" if is_rnnt else "ctc")

## 3.3. Setting up confidence estimation
To set up confidence estimation for NeMo ASR models, you need to:
1. Initialize _ConfidenceConfig_
2. Put the created _ConfidenceConfig_ into the model decoding config.

The following cell contains an example of _ConfidenceConfig_ initialization and updating the model's decoding config.

The cell also prints the allowed values for the parameters of _ConfidenceConfig_ and _ConfidenceMethodConfig_.

Note that only `strategy="greedy"` (or `greedy_batch` for RNN-T) supports computing confidence scores.

In [None]:
from nemo.collections.asr.parts.submodules.rnnt_decoding import RNNTDecodingConfig
from nemo.collections.asr.parts.submodules.ctc_decoding import CTCDecodingConfig
from nemo.collections.asr.parts.utils.asr_confidence_utils import (
    ConfidenceConfig,
    ConfidenceConstants,
    ConfidenceMethodConfig,
    ConfidenceMethodConstants,
)
from nemo.collections.asr.parts.utils.asr_confidence_benchmarking_utils import (
    apply_confidence_parameters,
    get_correct_marks,
    get_token_targets_with_confidence,
    get_word_targets_with_confidence,
)


# List allowed options for ConfidenceMethodConfig and ConfidenceConfig
print(f"Allowed options for ConfidenceMethodConfig: {ConfidenceMethodConstants.print()}\n")
print(f"Allowed options for ConfidenceConfig: {ConfidenceConstants.print()}\n")

# Initialize ConfidenceConfig and ConfidenceMethodConfig
# This first setup deliberately uses the maximum-probability method ("max_prob") as a baseline.
confidence_cfg = ConfidenceConfig(
    preserve_frame_confidence=True, # Internally set to true if preserve_token_confidence == True
    # or preserve_word_confidence == True
    preserve_token_confidence=True, # Internally set to true if preserve_word_confidence == True
    preserve_word_confidence=True,
    aggregation="prod", # How to aggregate frame scores to token scores and token scores to word scores
    exclude_blank=False, # If true, only non-blank emissions contribute to confidence scores
    tdt_include_duration=False, # If true, calculate duration confidence for the TDT models
    method_cfg=ConfidenceMethodConfig( # Config for per-frame scores calculation (before aggregation)
        name="max_prob", # Or "entropy" (default), which usually works better
        entropy_type="gibbs", # Used only for name == "entropy". Recommended: "tsallis" (default) or "renyi"
        alpha=0.5, # Low values (<1) increase sensitivity, high values decrease sensitivity
        entropy_norm="lin" # How to normalize (map to [0,1]) entropy. Default: "exp"
    )
)

# Alternatively, look at ConfidenceConfig's docstring
print(f"More info on ConfidenceConfig here:\n{ConfidenceConfig().__doc__}\n")

# Put the created ConfidenceConfig into the model decoding config via .change_decoding_strategy()
# The decoding config class has to match the model type selected by `is_rnnt`.
model.change_decoding_strategy(
    RNNTDecodingConfig(fused_batch_size=-1, strategy="greedy_batch", confidence_cfg=confidence_cfg)
    if is_rnnt
    else CTCDecodingConfig(confidence_cfg=confidence_cfg)
)

## 3.4. Decode test set and get transcriptions with confidence scores
Let's transcribe Librispeech _test-other_ and see what confidence scores are inside.

In [None]:
# Transcribe the whole test set; return_hypotheses=True is requested so that the results
# are hypothesis objects (later cells read their .text and *_confidence attributes).
current_test_set = test_sets["test_other"]
transcriptions = model.transcribe(audio=current_test_set.filepaths, batch_size=16, return_hypotheses=True, num_workers=4)
# NOTE(review): for RNN-T only the first element of the result is kept — presumably
# transcribe() returns a tuple whose first item holds the best hypotheses; confirm for
# the installed NeMo version.
if is_rnnt:
    transcriptions = transcriptions[0]

A transcribed hypothesis can carry `frame_confidence`, as well as `token_confidence` and `word_confidence`, which are aggregated from the frame-level scores.

In [None]:
def round_confidence(confidence_number, ndigits=3):
    """Round a confidence value for display.

    Accepts a Python float, or a tensor-like object exposing `.size()` and `.item()`
    that is either a scalar or one-dimensional.

    Returns:
        A rounded float for a float or scalar tensor, a list of rounded floats for a
        one-dimensional tensor.

    Raises:
        RuntimeError: If the tensor has more than one dimension.
    """
    if isinstance(confidence_number, float):
        return round(confidence_number, ndigits)

    rank = len(confidence_number.size())
    if rank == 0:  # tensor holding a single element
        return round(confidence_number.item(), ndigits)
    if rank == 1:  # tensor holding a list of elements
        return [round(element.item(), ndigits) for element in confidence_number]
    raise RuntimeError(f"Unexpected confidence_number: `{confidence_number}`")


# Show the text and all three confidence levels of the first hypothesis.
# For RNN-T every frame_confidence entry is itself iterated (a list of scores per frame),
# while for CTC each entry is rounded directly.
tran = transcriptions[0]
print(
    f"""    Recognized text: `{tran.text}`\n
    Word confidence: {[round_confidence(c) for c in tran.word_confidence]}\n
    Token confidence: {[round_confidence(c) for c in tran.token_confidence]}\n
    Frame confidence: {
        [([round_confidence(cc) for cc in c] if is_rnnt else round_confidence(c)) for c in tran.frame_confidence]
    }"""
)

Now let's draw the recognition results highlighted according to their confidence scores.

There are four options: plain text and HTML with or without WER labels.

In [None]:
from nemo.collections.asr.metrics.wer import word_error_rate, word_error_rate_detail, word_error_rate_per_utt

def show_dataset_with_confidence(
    indices,
    transcriptions,
    test_set,
    threshold,
    filepaths=None,
    html_show=False,
    min_dur_to_show=0.0,
    utt_to_show=10
):
    """Pretty-print a selection of utterances with their word confidence highlighted.

    Args:
        indices: Iterable of `(utterance_index, value)` pairs; only the index is used,
            and utterances are visited in the given order.
        transcriptions: Hypotheses providing `.text` and `.word_confidence`.
        test_set: Object providing `durations` and `reference_texts`.
        threshold: Words scoring below this value are shaded.
        filepaths: Optional audio paths; when given, an audio player is displayed
            after each printed utterance.
        html_show: Render via HTML instead of plain colored text.
        min_dur_to_show: Utterances shorter than this are skipped.
        utt_to_show: Maximum number of utterances to print.
    """
    shown = 0
    for utt_idx, _ in indices:
        if shown >= utt_to_show:
            break
        if not (test_set.durations[utt_idx] >= min_dur_to_show):
            continue

        print("="*120)
        utterance = transcriptions[utt_idx]
        hypothesis_text = utterance.text
        word_scores = utterance.word_confidence
        reference_text = test_set.reference_texts[utt_idx]
        pretty_print_transcript_with_confidence(
            hypothesis_text, word_scores, threshold, reference_text, html=html_show
        )
        if filepaths is not None:
            display(Audio(filepaths[utt_idx]))
        shown += 1


# you can play with these parameters
threshold = 0.52
# in colab, you may want to use `html_show = True` as non-html coloring is displayed incorrectly in colab
html_show = is_colab
min_dur_to_show = 4.0
utt_to_show = 5

# Per-utterance WER, then (index, WER) pairs ordered from the highest WER to the lowest,
# so that the most erroneous utterances are shown first.
wer_per_utt, avg_wer = word_error_rate_per_utt([h.text for h in transcriptions], current_test_set.reference_texts)
sorted_wer_indices = sorted(enumerate(wer_per_utt), key=lambda x: x[1])[::-1]

show_dataset_with_confidence(
    indices=sorted_wer_indices,
    transcriptions=transcriptions,
    test_set=current_test_set,
    threshold=threshold,
    filepaths=current_test_set.filepaths,
    html_show=html_show,
    min_dur_to_show=min_dur_to_show,
    utt_to_show=utt_to_show
)

## 3.5. Confidence metrics

There are several metrics to evaluate the effectiveness of a confidence estimation method. Some of them consider confidence estimation as a binary classification task. Others measure how close the correct word confidence scores are to $1.0$ and the incorrect word scores are to $0.0$.

Some of them are:
1. Area Under the Receiver Operating Characteristics Curve ($\mathrm{AUC}_\mathrm{ROC}$): class separability metric.
2. Area Under the Precision-Recall Curve ($\mathrm{AUC}_\mathrm{PR}$): how well the correct words are detected.
3. Area Under the Negative Predictive Value vs. True Negative Rate Curve ($\mathrm{AUC}_\mathrm{NT}$): how well the incorrect words are detected ($\mathrm{AUC}_\mathrm{PR}$ in which errors are treated as positives).
4. Normalized Cross Entropy ($\mathrm{NCE}$): how close the confidence scores of correct predictions are to $1.0$ and those of incorrect predictions are to $0.0$. It ranges from $-\infty$ to $1.0$, with negative scores indicating that the confidence method performs worse than setting every confidence score to $1-\mathrm{WER}$. This metric is also known as Normalized Mutual Information.
5. Expected Calibration Error ($\mathrm{ECE}$): a weighted average over the absolute accuracy/confidence difference. It ranges from $0.0$ to $1.0$ with the best value $0.0$.

Metrics based on the Youden's curve (see https://en.wikipedia.org/wiki/Youden%27s_J_statistic) can also be considered. They are:
1. Area Under the Youden's curve ($\mathrm{AUC}_\mathrm{YC}$): the rate of the effective threshold range (i.e. the adjustability or responsiveness). It ranges from $0.0$ to $1.0$ with the best value $0.5$.
2. Maximum of the Youden's curve ($\mathrm{MAX}_\mathrm{YC}$): the optimal $\mathrm{TNR}$ vs. $\mathrm{FNR}$ tradeoff. Its unnormalized version can be used as a criterion for selecting the optimal $\tau$. It ranges from $0.0$ to $1.0$ with the best value $1.0$.
3. The standard deviation of the Youden's curve values ($\mathrm{STD}_\mathrm{YC}$): indicates that $\mathrm{TNR}$ and $\mathrm{FNR}$ increase at different rates (viz. $\mathrm{TNR}$ grows faster) as the $\tau$ increases. It ranges from $0.0$ to $0.5$ with the best value around $0.25$.

When selecting/tuning a confidence method, it is recommended to maximize $\mathrm{AUC}_\mathrm{ROC}$ first as this is the main metric of confidence estimation quality. Then, for overconfident models, maximizing $\mathrm{AUC}_\mathrm{NT}$ should take precedence over $\mathrm{AUC}_\mathrm{PR}$. Finally, a trade-off between $\mathrm{NCE}$/$\mathrm{ECE}$ and the family of $\mathrm{YC}$ metrics can be considered as a compromise between formal correctness and controllability.

Let's see how well our confidence performs according to the metrics above.

In [None]:
from nemo.collections.asr.parts.utils.confidence_metrics import (
    auc_nt,
    auc_pr,
    auc_roc,
    auc_yc,
    ece,
    nce,
    save_confidence_hist,
    save_custom_confidence_curve,
    save_nt_curve,
    save_pr_curve,
    save_roc_curve,
)


# Collect, for every recognized word, a correctness mark and its confidence score.
targets_with_confidence = [get_word_targets_with_confidence(tran) for tran in transcriptions]
correct_marks = [get_correct_marks(r.split(), h.words) for r, h in zip(current_test_set.reference_texts, transcriptions)]

# Flatten over utterances into two parallel arrays: y_true (marks) and y_score (second
# element of each target/confidence pair).
y_true, y_score = np.array(
    [[f, p[1]] for cm, twc in zip(correct_marks, targets_with_confidence) for f, p in zip(cm, twc)]
).T


# output scheme: three YC statistics, optionally followed by (thresholds, yc)
# NOTE(review): the print below reads results[6] as STD_YC and results[7] as MAX_YC, i.e. it
# assumes the order yc.mean(), yc.std(), yc.max() — confirm against auc_yc's return statement.
result_yc = auc_yc(y_true, y_score, return_std_maximum=True, return_curve=True)
# output scheme: ece or ece, (thresholds, ece_curve)
results_ece = ece(y_true, y_score, return_curve=True)
# results[0:5] are ROC, PR, NT, NCE, ECE; results[5:8] are the three YC statistics.
results = [
    auc_roc(y_true, y_score),
    auc_pr(y_true, y_score),
    auc_nt(y_true, y_score),
    nce(y_true, y_score),
    results_ece[0],
] + list(result_yc[:3])

print(
    f"""    AUC_ROC:\t{results[0]:.5f}
    AUC_PR:\t{results[1]:.5f}
    AUC_NT:\t{results[2]:.5f}
    NCE:\t{results[3]:.5f}
    ECE:\t{results[4]:.5f}
    AUC_YC:\t{results[5]:.5f}
    MAX_YC:\t{results[7]:.5f}
    STD_YC:\t{results[6]:.5f}
    """
)

Confidence metrics for the maximum probability confidence are not that great.

Let's re-run and benchmark confidence estimation with the default confidence estimator.

In [None]:
# Re-create the config with only the preservation flags set, so that every other option
# (including the confidence method) falls back to ConfidenceConfig's defaults.
confidence_cfg = ConfidenceConfig(
    preserve_word_confidence=True,
    preserve_token_confidence=True,
)

model.change_decoding_strategy(
    RNNTDecodingConfig(fused_batch_size=-1, strategy="greedy_batch", confidence_cfg=confidence_cfg)
    if is_rnnt
    else CTCDecodingConfig(confidence_cfg=confidence_cfg)
)

# Decode again: this overwrites `transcriptions` from the max_prob run above.
transcriptions = model.transcribe(audio=current_test_set.filepaths, batch_size=16, return_hypotheses=True, num_workers=4)
if is_rnnt:
    transcriptions = transcriptions[0]

In [None]:
# Same metric computation as for the max_prob run, now on the re-decoded `transcriptions`.
targets_with_confidence = [get_word_targets_with_confidence(tran) for tran in transcriptions]
correct_marks = [get_correct_marks(r.split(), h.words) for r, h in zip(current_test_set.reference_texts, transcriptions)]

y_true, y_score = np.array(
    [[f, p[1]] for cm, twc in zip(correct_marks, targets_with_confidence) for f, p in zip(cm, twc)]
).T

# result_yc and results_ece are reused by the plotting cells below (their last element is the curve).
result_yc = auc_yc(y_true, y_score, return_std_maximum=True, return_curve=True)
results_ece = ece(y_true, y_score, return_curve=True)
results = [
    auc_roc(y_true, y_score),
    auc_pr(y_true, y_score),
    auc_nt(y_true, y_score),
    nce(y_true, y_score),
    results_ece[0],
] + list(result_yc[:3])

# NOTE(review): results[6] is printed as STD_YC and results[7] as MAX_YC, which assumes
# auc_yc returns (mean, std, max) — confirm against auc_yc's return statement.
print(
    f"""    AUC_ROC:\t{results[0]:.5f}
    AUC_PR:\t{results[1]:.5f}
    AUC_NT:\t{results[2]:.5f}
    NCE:\t{results[3]:.5f}
    ECE:\t{results[4]:.5f}
    AUC_YC:\t{results[5]:.5f}
    MAX_YC:\t{results[7]:.5f}
    STD_YC:\t{results[6]:.5f}
    """
)

Note that despite the overall improvement, $\mathrm{NCE}$ and $\mathrm{ECE}$ have gotten worse. This is due to class imbalance caused by low WER.

Now, let's draw $\mathrm{ROC}$ as well as histograms of correctly and incorrectly recognized words.

In [None]:
from tempfile import TemporaryDirectory


# Scratch directory for the generated plots; it is referenced again by the next plotting cell.
plot_dir = TemporaryDirectory()
os.makedirs(plot_dir.name, exist_ok=True)

# Split the scores by whether the word was recognized correctly (y_true == 1).
mask_correct = y_true == 1
y_score_correct = y_score[mask_correct]
y_score_incorrect = y_score[~mask_correct]

# histogram of the correct distribution
save_confidence_hist(y_score_correct, plot_dir.name, "hist_correct")
# histogram of the incorrect distribution
save_confidence_hist(y_score_incorrect, plot_dir.name, "hist_incorrect")
# AUC-ROC curve
save_roc_curve(y_true, y_score, plot_dir.name, "roc")


# The images are read back as "<name>.png" from the plot directory.
display(
    Image(filename=os.path.join(plot_dir.name, "hist_correct.png"), retina=True),
    Image(filename=os.path.join(plot_dir.name, "hist_incorrect.png"), retina=True),
    Image(filename=os.path.join(plot_dir.name, "roc.png"), retina=True),
)

Optionally, you can look at curves for other metrics ($\mathrm{PR}$, $\mathrm{NT}$, $\mathrm{ECE}$, and $\mathrm{YC}$).

In [None]:
# AUC-PR curve
save_pr_curve(y_true, y_score, plot_dir.name, "pr")
# AUC-NT curve
save_nt_curve(y_true, y_score, plot_dir.name, "nt")
# ECE curve
ece_thresholds, ece_values = results_ece[-1]
# Scale the curve so that its maximum is 1. This is an in-place division, so the array
# stored inside results_ece is modified as well.
ece_values /= max(ece_values)
save_custom_confidence_curve(
    ece_thresholds, ece_values, plot_dir.name, "ece", "Threshold", "|Accuracy − Confidence score|"
)
# AUC-YC curve
yc_thresholds, yc_values = result_yc[-1]
save_custom_confidence_curve(
    yc_thresholds, yc_values, plot_dir.name, "yc", "Threshold", "True positive rate − False Positive Rate"
)


display(
    Image(filename=os.path.join(plot_dir.name, "pr.png"), retina=True),
    Image(filename=os.path.join(plot_dir.name, "nt.png"), retina=True),
    Image(filename=os.path.join(plot_dir.name, "ece.png"), retina=True),
    Image(filename=os.path.join(plot_dir.name, "yc.png"), retina=True),
)

You can use `scripts/speech_recognition/confidence/benchmark_asr_confidence.py` to find optimal confidence hyperparameters.

# 4. Confidence applications

## 4.1. Small WER improvement

Good confidence scores can slightly reduce WER by removing low confidence words from recognition results.

Consider the following example.

Let's look at the detailed WER of the transcribed test set before and after removing words with low confidence score.

In [None]:
def drop_low_confidence_words(x, y, z):
    """Remove low-confidence words from a transcript.

    Args:
        x: Transcript text; it is split on whitespace.
        y: Confidence scores, paired with the words in order.
        z: Threshold; a word is kept only when its score is >= z.

    Returns:
        The kept words joined by single spaces.
    """
    kept_words = []
    for token, score in zip(x.split(), y):
        if score >= z:
            kept_words.append(token)
    return " ".join(kept_words)


# Cut-off for dropping words; note that this overwrites the display `threshold` set earlier.
threshold = 0.001

# NOTE(review): indices 0, 2, 3, 4 of the result are used as WER, insertion, deletion and
# substitution rate — presumably index 1 is a token count; confirm against word_error_rate_detail.
wer_initial = word_error_rate_detail([h.text for h in transcriptions], current_test_set.reference_texts)
print(
    f"""WER detail before removing low confidence words:
    WER:\t{wer_initial[0]:.5f}
    INS_rate:\t{wer_initial[2]:.5f}
    DEL_rate:\t{wer_initial[3]:.5f}
    SUB_rate:\t{wer_initial[4]:.5f}"""
)

# Same measurement after removing every word whose confidence is below `threshold`.
wer_conf_dropped = word_error_rate_detail(
    [drop_low_confidence_words(hyp.text, hyp.word_confidence, threshold) for hyp in transcriptions],
    current_test_set.reference_texts,
)
print(
    f"""WER detail after removing low confidence words:
    WER:\t{wer_conf_dropped[0]:.5f}
    INS_rate:\t{wer_conf_dropped[2]:.5f}
    DEL_rate:\t{wer_conf_dropped[3]:.5f}
    SUB_rate:\t{wer_conf_dropped[4]:.5f}"""
)

You can see that the right (in this example, extremely low) `threshold` can reduce WER by a tiny bit, reducing insertions and substitutions yet increasing deletions.

Now let's see how to find the optimal threshold.

The most commonly used method for automatically determining the optimal cutoff threshold is taking the value which delivers the maximum of the unnormalized Youden's curve. This method allows you to remove the largest number of incorrect entities, sacrificing the minimum number of correct entities.

However, the unnormalized $\mathrm{MAX}_\mathrm{YC}$ method does not work well for the purpose of the WER reduction. Let's compare this method to explicitly minimizing WER with respect to a threshold.

In [None]:
from joblib import Parallel, delayed
from multiprocessing import cpu_count
from tqdm.notebook import tqdm

def max_unnnormalized_yc(
    y_true: Union[List[int], np.ndarray],
    y_score: Union[List[float], np.ndarray],
    n_bins: int = 100,
    start: float = 0.0,
    stop: float = 1.0,
):
    """Find the threshold at which the unnormalized Youden's curve reaches its maximum.

    For every candidate threshold the curve value is `TN - FN`, where TN is the number
    of incorrect items scoring below the threshold and FN is the number of correct
    items scoring below it.

    Args:
        y_true: Correctness marks, 1 for correct and 0 for incorrect.
        y_score: Confidence scores, same length as `y_true`.
        n_bins: Number of intervals; `n_bins + 1` evenly spaced thresholds are tried.
        start: Lowest threshold.
        stop: Highest threshold.

    Returns:
        The threshold that maximizes `TN - FN` (the smallest one on ties).
        `0.0` when all marks are equal (or the input is empty): there is nothing
        to separate, so no cut-off is suggested.
    """
    y_true = np.array(y_true)
    y_score = np.array(y_score)
    assert len(y_true) == len(y_score)
    assert np.all(y_true >= 0) and np.all(y_true <= 1)
    # Degenerate input: return a scalar like the regular path does, because callers
    # compare scores against the result.
    if np.all(y_true == 0) or np.all(y_true == 1):
        return 0.0

    thresholds = np.linspace(start, stop, n_bins + 1)
    mask_correct = y_true == 1
    y_score_correct = y_score[mask_correct]
    y_score_incorrect = y_score[~mask_correct]

    unnormalized_yc = np.array(
        [
            np.count_nonzero(y_score_incorrect < threshold) - np.count_nonzero(y_score_correct < threshold)
            for threshold in thresholds
        ]
    )
    # argmax returns the first maximum, i.e. the smallest of the best thresholds
    return thresholds[int(np.argmax(unnormalized_yc))]


def min_wer(ref: List[str], transcriptions, n_bins: int = 100, start: float = 0.0, stop: float = 1.0):
    """Search for the confidence cut-off that gives the lowest WER.

    `n_bins + 1` evenly spaced thresholds between `start` and `stop` are evaluated in
    parallel; for each one, words scoring below the threshold are dropped from every
    hypothesis before WER is computed against `ref`.

    Returns:
        `(threshold, wer)` pair with the minimal WER (the lowest threshold on ties).
    """

    def _wer_at(cutoff, text_score_pairs, references):
        filtered = [drop_low_confidence_words(pair[0], pair[1], cutoff) for pair in text_score_pairs]
        return cutoff, word_error_rate_detail(filtered, references)[0]

    candidates = np.linspace(start, stop, n_bins + 1)
    text_score_pairs = [(tran.text, tran.word_confidence) for tran in transcriptions]
    scored = Parallel(n_jobs=cpu_count())(
        delayed(_wer_at)(candidate, text_score_pairs, ref) for candidate in tqdm(candidates)
    )
    return min(scored, key=lambda pair: pair[1])


# Rebuild the flat (correctness mark, confidence score) arrays for the current `transcriptions`.
targets_with_confidence = [get_word_targets_with_confidence(tran) for tran in transcriptions]
correct_marks = [
    get_correct_marks(r.split(), h.words) for r, h in zip(current_test_set.reference_texts, transcriptions)
]
y_true, y_score = np.array(
    [[f, p[1]] for cm, twc in zip(correct_marks, targets_with_confidence) for f, p in zip(cm, twc)]
).T

# Threshold from the maximum of the unnormalized Youden's curve, and the WER it yields.
threshold_yc = max_unnnormalized_yc(y_true, y_score)
yc_wer_value = word_error_rate(
    [drop_low_confidence_words(hyp.text, hyp.word_confidence, threshold_yc) for hyp in transcriptions],
    current_test_set.reference_texts,
)
# Direct search for the WER-minimizing threshold, restricted to [0, 0.1].
threshold_min_wer, min_wer_value = min_wer(current_test_set.reference_texts, transcriptions, stop=0.1)

# `wer_initial` comes from the earlier cell that measured WER before dropping any words.
print(
    f"""    Initial WER: {wer_initial[0]:.5f}
    Optimal threshold and WER based on the Youden's curve: {threshold_yc}, {yc_wer_value:.5f}
    Optimal threshold for the minimum WER: {threshold_min_wer}, {min_wer_value:.5f}
    """
)

As you can see, the optimal cutoff threshold as the maximum of the Youden's curve makes WER significantly worse, and the optimal threshold for the minimum WER is near zero.

Let's use a different confidence estimation setup to see if we can improve WER at least a bit further.

In [None]:
# Alternative setup: "min" aggregation with a Tsallis entropy method (alpha=1.5, linear
# normalization). The method options are passed as a DictConfig instead of a
# ConfidenceMethodConfig instance.
confidence_cfg = ConfidenceConfig(
    preserve_word_confidence=True,
    preserve_token_confidence=True,
    aggregation="min",
    method_cfg=DictConfig({"entropy_type": "tsallis", "alpha": 1.5, "entropy_norm": "lin"}),
)

model.change_decoding_strategy(
    RNNTDecodingConfig(fused_batch_size=-1, strategy="greedy_batch", confidence_cfg=confidence_cfg)
    if is_rnnt
    else CTCDecodingConfig(confidence_cfg=confidence_cfg)
)

# Decode again; `transcriptions` from the previous setup is overwritten.
transcriptions = model.transcribe(audio=current_test_set.filepaths, batch_size=16, return_hypotheses=True, num_workers=4)
if is_rnnt:
    transcriptions = transcriptions[0]

# This time the full default threshold range [0, 1] is searched.
threshold_min_wer, min_wer_value = min_wer(current_test_set.reference_texts, transcriptions)

# NOTE: `wer_initial` is not recomputed here; it is the value measured in the earlier cell.
print(
    f"""    Initial WER: {wer_initial[0]:.5f}
    Optimal threshold for the minimum WER: {threshold_min_wer}, {min_wer_value:.5f}
    """
)

Overall, such an improvement in WER is too small to be considered. However, this opens up the possibility of improving WER through the use of more accurate confidence estimation methods.

## 4.2. Reducing hallucinations with confidence scores

One common application of confidence scores is the removal of recognition hallucinations.

Let's see how this can be done.

Firstly, let's obtain a dataset on which the ASR model can hallucinate.

Here we make it from the librosa examples, reversing them and summing them pairwise (the shorter signal is repeated cyclically to match the length of the longer one).

In [None]:
from itertools import combinations
import json
import librosa
import soundfile as sf

def cyclic_sum(x, y):
    """Add two arrays, cyclically repeating the shorter one up to the length of the longer one.

    Lengths are compared along the first axis (`shape[0]`); the function is used here
    with 1-D audio signals.
    """
    longer, shorter = (y, x) if x.shape[0] < y.shape[0] else (x, y)
    target_len = longer.shape[0]
    if target_len > shorter.shape[0]:
        # mode='wrap' maps out-of-range indices back to the start, i.e. the signal repeats
        shorter = np.take(shorter, range(0, target_len), mode='wrap')
    return longer + shorter

def generate_noise_examples(example_list: List[str], save_dir: str, samplerate: int = 16000):
    """Build non-speech recordings out of librosa's example audio.

    Every requested example is loaded at `samplerate` and reversed in time; each unordered
    pair of reversed signals is then added together with `cyclic_sum`. The mixtures are
    written to `save_dir` as wav files, together with a manifest holding one JSON object
    per line.

    Args:
        example_list: keys of the librosa examples to load.
        save_dir: output directory for the wavs and the manifest (created if missing).
        samplerate: sampling rate used both for loading and for writing.

    Returns:
        Path to the written manifest file (`<save_dir>/manifest.json`).
    """
    # Phase 1: load every example and keep its time-reversed version.
    reversed_signals = {}
    for example_key in example_list:
        example_path = librosa.util.example(key=example_key, hq=True)
        reversed_signals[example_key] = librosa.core.load(example_path, sr=samplerate)[0][::-1]

    # Phase 2: mix every unordered pair of reversed signals.
    mixtures = {}
    for left, right in combinations(reversed_signals.keys(), 2):
        mixtures["_".join([left, right])] = cyclic_sum(reversed_signals[left], reversed_signals[right])

    # Phase 3: dump the wavs and describe each of them with one manifest line.
    os.makedirs(save_dir, exist_ok=True)
    manifest = os.path.join(save_dir, "manifest.json")
    with open(manifest, "tw", encoding="utf-8") as fout:
        for name, audio in mixtures.items():
            audio_path = os.path.join(save_dir, f"{name}.wav")
            sf.write(audio_path, audio, samplerate=samplerate)
            entry = {
                "audio_filepath": audio_path,
                "duration": librosa.core.get_duration(y=audio, sr=samplerate),
                "label": "noise",
                "text": "_",
            }
            fout.write(json.dumps(entry))
            fout.write('\n')

    return manifest

# Keys of the librosa example recordings used as raw material for the noise set.
librosa_list_examples = [
    'brahms',
    'choice',
    'fishin',
    'humpback',
    'libri1',
    'libri2',
    'libri3',
    'nutcracker',
    'pistachio',
    'robin',
    'sweetwaltz',
    'trumpet',
    'vibeace',
]
sr = 16000

# The generated wavs and their manifest go to a dedicated sub-folder of DATA_DIR.
noise_dir = os.path.join(DATA_DIR, "noise")
noise_manifest = generate_noise_examples(
    example_list=librosa_list_examples,
    save_dir=noise_dir,
    samplerate=sr,
)

The original examples contain speech, music, or noise. The resulting audio recordings are considered to contain no recognizable speech.

You can listen to one of the generated recordings.

In [None]:
# Load the generated noise manifest.
noise_data = load_data(noise_manifest)

# Render an audio player for the first noise recording.
display(Audio(noise_data.filepaths[0]))

Now let's transcribe our new data, setting the default confidence estimator.

In [None]:
# Only the "preserve" flags are set here; every other confidence option keeps its default.
confidence_cfg = ConfidenceConfig(preserve_word_confidence=True, preserve_token_confidence=True)

# The decoding config class depends on whether the model is an RNNT or a CTC one.
if is_rnnt:
    decoding_cfg = RNNTDecodingConfig(fused_batch_size=-1, strategy="greedy_batch", confidence_cfg=confidence_cfg)
else:
    decoding_cfg = CTCDecodingConfig(confidence_cfg=confidence_cfg)
model.change_decoding_strategy(decoding_cfg)

# Transcribe the noise-only recordings, keeping the full hypotheses (with confidence scores).
noise_transcriptions = model.transcribe(
    audio=noise_data.filepaths,
    batch_size=4,
    return_hypotheses=True,
    num_workers=4,
)
if is_rnnt:
    # For RNNT models only the first element of the returned value is used.
    noise_transcriptions = noise_transcriptions[0]

On a fully non-speech dataset, hallucinations can be measured as the Word Insertions per Second (WIS) value.

In [None]:
def word_insertions_per_second(texts: List[str], durations: List[float]):
    """Calculate the Word Insertions per Second (WIS) value for the given recognition results
    and their corresponding audio durations.

    Args:
        texts: recognized text of every utterance.
        durations: duration of every utterance, aligned with `texts`.

    Returns:
        A tuple `(wis, wis_per_utt)`: the mean of the per-utterance values and the list of
        per-utterance values. For empty input the result is `(0.0, [])`.
    """
    assert len(texts) == len(durations)

    # `str.split()` without a separator ignores leading/trailing/repeated whitespace and
    # returns [] for an empty string, so an empty transcription counts as zero inserted
    # words (`"".split(" ")` yields [''] and would count it as one).
    wis_per_utt = [len(text.split()) / duration for text, duration in zip(texts, durations)]
    if not wis_per_utt:
        # nothing to average
        return 0.0, wis_per_utt
    return sum(wis_per_utt) / len(wis_per_utt), wis_per_utt

# Baseline WIS on the noise set, computed from the hypothesis texts as they are
# (no confidence-based filtering applied yet).
wis, wis_per_utt = word_insertions_per_second([t.text for t in noise_transcriptions], noise_data.durations)
print(f"Original Word Insertions per Second: {wis:.5f}")

Now, the ability of a confidence estimator to detect hallucinations is computed as the Hallucination Detection Rate (HDR).

It shows how many of all hallucinations can be removed, provided that no more than some fixed percentage of correct words are erroneously removed (under normal recognition conditions).

HDR is another name for the metric $\mathrm{TNR}_{FNR=e}$, which is calculated as $\mathrm{TNR}(Y,\tau): \mathrm{FNR}(X,\tau) \approx e$, where $X$ is the dataset with supervision (used to tune $\tau$) and $Y$ is the noise-only dataset. A typical value of $e$ is 0.05.

Let's compute HDR and the new WIS.

The generated dataset is clearly distinct from speech, so $e=0.01$ is sufficient.

In [None]:
def hdr(
    y_true_speech: Union[List[int], np.ndarray],
    y_score_speech: Union[List[float], np.ndarray],
    y_score_noise: Union[List[float], np.ndarray],
    max_fnr: float = 0.05,
    n_bins: int = 100,
) -> Tuple[float, float]:
    """Compute Hallucination Detection Rate (HDR) from prediction scores.

    The threshold is tuned on the speech data: a uniform grid over [0, 1] is scanned in
    increasing order and the last value for which the share of correct words scored below it
    (FNR) does not exceed `max_fnr` is kept. HDR is the share of noise-data words scored
    below that threshold (TNR).

    Args:
        y_true_speech: correctness marks of the speech-data words (1 - correct, 0 - incorrect).
        y_score_speech: confidence scores of the speech-data words, aligned with `y_true_speech`.
        y_score_noise: confidence scores of the words recognized on the noise-only data.
        max_fnr: maximum allowed share of correct words falling below the threshold.
        n_bins: number of intervals in the threshold grid (`n_bins + 1` candidate thresholds).

    Returns:
        tnr: True-Negative Rate for HDR (0.0 if there are no noise-data words)
        threshold_hdr: Optimal threshold

        `(0.0, 0.0)` is returned when the speech marks are all zeros or all ones.
    """
    y_true_speech = np.array(y_true_speech)
    y_score_speech = np.array(y_score_speech)
    y_score_noise = np.array(y_score_noise)
    thresholds = np.linspace(0, 1, n_bins + 1)
    assert y_true_speech.shape[0] == y_score_speech.shape[0]
    assert np.all(y_true_speech >= 0) and np.all(y_true_speech <= 1)
    if np.all(y_true_speech == 0) or np.all(y_true_speech == 1):
        return 0.0, 0.0
    mask_correct = y_true_speech == 1
    count_correct = max(np.count_nonzero(mask_correct), 1)
    y_score_correct = y_score_speech[mask_correct]
    threshold_hdr = 0.0
    for threshold in thresholds:
        fnr = np.count_nonzero(y_score_correct < threshold) / count_correct
        if fnr > max_fnr:
            # FNR can only grow with the threshold, so no later candidate can qualify
            break
        threshold_hdr = threshold
    noise_count = y_score_noise.shape[0]
    if noise_count == 0:
        # No words were recognized on the noise data: there is nothing to detect, and
        # dividing by the word count would raise ZeroDivisionError.
        return 0.0, threshold_hdr
    tnr = np.count_nonzero(y_score_noise < threshold_hdr) / noise_count
    return tnr, threshold_hdr


# e
max_fnr = 0.01
# Adding a check for cases with apostrophes: keep only the hypotheses whose number of word
# confidences matches their number of words. (reference, hypothesis) PAIRS are filtered so that
# every surviving hypothesis stays aligned with its own reference, and `transcriptions` itself
# is left untouched, which keeps this cell safe to re-run.
# NOTE(review): `transcriptions` still holds the scores from the earlier Tsallis/`min` setup,
# while `noise_transcriptions` were scored with the default estimator -- confirm that tuning the
# threshold on one and applying it to the other is intended.
speech_pairs = [
    (r, h)
    for r, h in zip(current_test_set.reference_texts, transcriptions)
    if len(h.word_confidence) == len(h.text.split())
]
correct_marks = [mark for r, h in speech_pairs for mark in get_correct_marks(r.split(), h.words)]
y_score_speech = [w for _ref, h in speech_pairs for w in h.word_confidence]
y_score_noise = [w for h in noise_transcriptions for w in h.word_confidence]
hdr_score, threshold_hdr = hdr(correct_marks, y_score_speech, y_score_noise, max_fnr=max_fnr)
# WIS left after removing the detected share of hallucinated words.
wis_new = wis - wis * hdr_score

print(
    f"""    Hallucination Detection Rate for max_fnr={max_fnr} : {hdr_score:.5f}
    New Word Insertions Per Second: {wis_new:.5f}"""
)

Finally, let's print the noisy utterances to see whether any hallucinations persist.

In [None]:
# Rank the noise utterances by their WIS, highest first. Each item is an
# (utterance index, WIS) pair.
sorted_wis_indices = sorted(enumerate(wis_per_utt), key=lambda idx_and_wis: idx_and_wis[1])
sorted_wis_indices.reverse()

# Show the top utterances, using the threshold tuned for HDR.
show_dataset_with_confidence(
    indices=sorted_wis_indices,
    transcriptions=noise_transcriptions,
    test_set=noise_data,
    filepaths=noise_data.filepaths,
    threshold=threshold_hdr,
    min_dur_to_show=0.0,
    utt_to_show=5,
    html_show=is_colab,
)

# Summary
This tutorial covered the basics of ASR confidence estimation and two examples of using ASR word confidence: WER reduction and hallucination removal.