Spaces:
Sleeping
Sleeping
from typing import Tuple | |
import numpy as np | |
from espnet2.sds.utils.utils import int2float | |
def _error_rate(counts: dict, prefix: str) -> float:
    """Turn Levenshtein edit counts into an error rate (a fraction, not a percent).

    Args:
        counts: Mapping that contains the keys ``f"{prefix}_delete"``,
            ``f"{prefix}_insert"``, ``f"{prefix}_replace"`` and
            ``f"{prefix}_equal"``.
        prefix: Key prefix selecting the system and granularity,
            e.g. ``"espnet_wer"`` or ``"whisper_cer"``.

    Returns:
        ``(delete + insert + replace) / (delete + replace + equal)``.
        When the denominator (the reference length) is zero the ratio is
        undefined; ``0.0`` is returned if there were no edits at all and
        ``float("inf")`` if the hypothesis consists only of insertions.
    """
    deletions = counts[f"{prefix}_delete"]
    insertions = counts[f"{prefix}_insert"]
    substitutions = counts[f"{prefix}_replace"]
    matches = counts[f"{prefix}_equal"]

    errors = deletions + insertions + substitutions
    # Every reference token is either deleted, substituted or matched.
    reference_length = deletions + substitutions + matches

    if reference_length == 0:
        # Guard against ZeroDivisionError for an empty reference.
        return 0.0 if errors == 0 else float("inf")
    return errors / reference_length


def handle_espnet_TTS_intelligibility(
    TTS_audio_output: Tuple[int, np.ndarray], LLM_Output: str
) -> str:
    """
    Compute Word Error Rate (WER) and Character Error Rate (CER) of the TTS audio
    for three ASR systems (ESPnet, OWSM, Whisper) using the Versa library.

    For each ASR system, in the order ESPnet, OWSM, Whisper, this function:
        1. Builds the metric arguments with the system's ``*_wer_setup`` function
           (``model_tag="default"``, ``beam_size=1``,
           ``text_cleaner="whisper_en"``, ``use_gpu=True``).
        2. Calls the system's ``*_levenshtein_metric`` function with those
           arguments, the audio converted by ``int2float``, the LLM output as
           the reference text, and the audio frame rate.
        3. Converts the returned delete/insert/replace/equal counts into
           WER and CER.

    Args:
        TTS_audio_output (Tuple[int, np.ndarray]):
            A tuple consisting of:
                - The first element (int): the frame rate of the audio.
                - The second element (np.ndarray): the audio signal.
        LLM_Output (str):
            The reference text generated by the LLM, which serves as the
            ground truth for evaluating the TTS audio.

    Returns:
        str:
            Six newline-separated lines giving WER and CER, as percentages
            with two decimals, for ESPnet, OWSM and Whisper. If a reference
            contains no scoreable tokens, the rate is reported as ``0.00``
            when the hypothesis is empty as well and as ``inf`` otherwise,
            instead of raising ``ZeroDivisionError``.

    Raises:
        ImportError:
            If the Versa library is not installed or cannot be imported.
            Any other exception raised while importing Versa is re-raised
            unchanged after an error message is printed.

    Example:
        >>> tts_audio_output = (16000, audio_array)
        >>> llm_output = "This is the reference text for evaluation."
        >>> result = handle_espnet_TTS_intelligibility(tts_audio_output, llm_output)
        >>> print(result)
        ESPnet WER: 10.50
        ESPnet CER: 7.20
        OWSM WER: 11.30
        OWSM CER: 8.00
        Whisper WER: 9.25
        Whisper CER: 6.50
    """
    # Versa is imported lazily so that importing this module does not
    # require Versa to be installed.
    try:
        from versa import (
            espnet_levenshtein_metric,
            espnet_wer_setup,
            owsm_levenshtein_metric,
            owsm_wer_setup,
            whisper_levenshtein_metric,
            whisper_wer_setup,
        )
    except Exception:
        print("Error: Versa is not properly installed.")
        raise

    # (display label, result-key prefix, setup function, metric function)
    asr_systems = (
        ("ESPnet", "espnet", espnet_wer_setup, espnet_levenshtein_metric),
        ("OWSM", "owsm", owsm_wer_setup, owsm_levenshtein_metric),
        ("Whisper", "whisper", whisper_wer_setup, whisper_levenshtein_metric),
    )

    report_lines = []
    for label, key, setup, metric in asr_systems:
        metric_args = setup(
            model_tag="default",
            beam_size=1,
            text_cleaner="whisper_en",
            use_gpu=True,
        )
        # The audio is converted anew for every system, so each metric
        # receives its own int2float result.
        counts = metric(
            metric_args,
            int2float(TTS_audio_output[1]),
            LLM_Output,
            TTS_audio_output[0],
        )
        wer = _error_rate(counts, f"{key}_wer")
        cer = _error_rate(counts, f"{key}_cer")
        report_lines.append(f"{label} WER: {wer * 100:.2f}")
        report_lines.append(f"{label} CER: {cer * 100:.2f}")

    return "\n".join(report_lines)