Spaces:
Sleeping
Sleeping
File size: 5,451 Bytes
b9a6dd9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 |
from typing import Tuple
import numpy as np
from espnet2.sds.utils.utils import int2float
def _error_rate(counts: dict, prefix: str) -> float:
    """Return the edit-distance error rate for one family of counts.

    Reads ``<prefix>_delete``, ``<prefix>_insert``, ``<prefix>_replace`` and
    ``<prefix>_equal`` from ``counts`` and computes
    ``(delete + insert + replace) / (delete + replace + equal)``.

    When the denominator is zero the ratio is undefined; this returns ``0.0``
    if there are no errors either, and ``float("inf")`` otherwise, instead of
    raising ``ZeroDivisionError``.
    """
    deletions = counts[f"{prefix}_delete"]
    insertions = counts[f"{prefix}_insert"]
    substitutions = counts[f"{prefix}_replace"]
    matches = counts[f"{prefix}_equal"]

    errors = deletions + insertions + substitutions
    reference_length = deletions + substitutions + matches
    if reference_length == 0:
        return 0.0 if errors == 0 else float("inf")
    return errors / reference_length


def handle_espnet_TTS_intelligibility(
    TTS_audio_output: Tuple[int, np.ndarray], LLM_Output: str
) -> str:
    """
    Compute Word Error Rate (WER) and Character Error Rate (CER) of TTS audio
    against the LLM text, using three ASR back-ends from the Versa library.

    For each of ESPnet, OWSM and Whisper (in that order) this function:
        1. Builds the scorer arguments with the matching ``*_wer_setup``
           function (``model_tag="default"``, ``beam_size=1``,
           ``text_cleaner="whisper_en"``, ``use_gpu=True``).
        2. Calls the matching ``*_levenshtein_metric`` with those arguments,
           the audio passed through ``int2float``, the reference text and the
           first element of ``TTS_audio_output``.
        3. Derives WER and CER from the returned delete/insert/replace/equal
           counts as ``(delete + insert + replace) / (delete + replace + equal)``.

    Args:
        TTS_audio_output (Tuple[int, np.ndarray]):
            A tuple consisting of:
                - The first element (int): the frame rate of the audio.
                - The second element (np.ndarray): the audio signal.
        LLM_Output (str):
            The reference text generated by the LLM, which serves as the
            ground truth for evaluating the TTS audio.

    Returns:
        str:
            Six newline-separated lines giving WER and CER as percentages
            with two decimals, for ESPnet, OWSM and Whisper. A rate whose
            denominator is zero is reported as ``0.00`` when there are no
            errors and as ``inf`` otherwise.

    Raises:
        Exception:
            Whatever the ``versa`` import raises (typically ``ImportError``
            when Versa is not installed) is re-raised after printing a
            diagnostic message.
        KeyError:
            If a metric result lacks one of the expected count keys.

    Example:
        >>> tts_audio_output = (16000, audio_array)
        >>> llm_output = "This is the reference text for evaluation."
        >>> result = handle_espnet_TTS_intelligibility(tts_audio_output, llm_output)
        >>> print(result)
        ESPnet WER: 10.50
        ESPnet CER: 7.20
        OWSM WER: 11.30
        OWSM CER: 8.00
        Whisper WER: 9.25
        Whisper CER: 6.50
    """
    # Imported lazily so the module can be loaded without Versa installed.
    try:
        from versa import (
            espnet_levenshtein_metric,
            espnet_wer_setup,
            owsm_levenshtein_metric,
            owsm_wer_setup,
            whisper_levenshtein_metric,
            whisper_wer_setup,
        )
    except Exception:
        print("Error: Versa is not properly installed.")
        raise  # bare raise keeps the original traceback intact

    frame_rate = TTS_audio_output[0]
    samples = TTS_audio_output[1]

    # (display label, result-key prefix, setup function, metric function)
    systems = (
        ("ESPnet", "espnet", espnet_wer_setup, espnet_levenshtein_metric),
        ("OWSM", "owsm", owsm_wer_setup, owsm_levenshtein_metric),
        ("Whisper", "whisper", whisper_wer_setup, whisper_levenshtein_metric),
    )

    report_lines = []
    for label, key, setup, metric in systems:
        scorer_args = setup(
            model_tag="default",
            beam_size=1,
            text_cleaner="whisper_en",
            use_gpu=True,
        )
        # NOTE(review): the conversion is repeated per system, as before, so
        # each metric receives its own int2float result; confirm the metrics
        # do not mutate their input before hoisting this out of the loop.
        counts = metric(scorer_args, int2float(samples), LLM_Output, frame_rate)
        for kind in ("wer", "cer"):
            rate = _error_rate(counts, f"{key}_{kind}")
            report_lines.append(f"{label} {kind.upper()}: {rate * 100:.2f}")

    return "\n".join(report_lines)
|